# Classifier Training

***
## Notes

 - http://www.deeplearningbook.org/contents/guidelines.html
 - Determine goals:
 - Establish end-to-end working pipeline:
 - Instrument  the system to determine bottlenecks in performance. Diagnose which components are performing worse, find over/underfitting, defect in data, etc.
 - Repeatedly make incremental changes (gathering new data, adjusting hyperparameters, changing algorithms) based on findings from instrumentation

## Download exported tables locally for use with TensorFlow

In [None]:
# Authenticate Google Drive access
# following guide: https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html
# Need client_secrets.json file for auth to work -- I put it in 'notebooks' folder, but how can we put it somewhere more appropriate and still have the auth find it?
from pydrive.auth import GoogleAuth

gauth = GoogleAuth()
gauth.LocalWebserverAuth()

In [None]:
from pydrive.drive import GoogleDrive
drive = GoogleDrive(gauth)

# list files in root folder
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()

# Get file id of export folder we're saving tables to
exportFolderID = [file['id'] for file in file_list if file['title']=='GEE_exports'][0] 

# lists files in export folder
file_list = drive.ListFile({'q': "'" + exportFolderID + "' in parents and trashed=false"}).GetList()
for file1 in file_list:
    print('title: %s, id: %s' % (file1['title'], file1['id']))
    
# Save first file in list locally (check with print command above to ensure it's sample points)
file_list[0].GetContentFile('../Data/sample_points.csv')

## Playing with TensorFlow

In [None]:
import numpy as np
import tensorflow as tf

## Random Forest

## Neural Network

## SVM

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV

In [2]:
# CSV of sample points with features
points = pd.read_csv('../Data/sample_points.csv')

# points.head()
list(points.columns)

# Recode landcover categories as dummy variables
# Probably want to remove impervious as well, or collapse some of the landcover types
nlcd_dict = {11:'open_water', # Dict of landcover values
             12:'perm_snowIce',
             21:'dev_openSpace',
             22:'dev_lowInt',
             23:'dev_medInt',
             24:'dev_highInt',
             31:'barren',
             41:'decid_forest',
             42:'evergreen_forest',
             43:'mixed_forest',
             51:'dwarf_scrub',
             52:'shrub',
             71:'grassland',
             72:'sedge',
             73:'lichens',
             74:'moss',
             81:'pasture',
             82:'crops',
             90:'woody_wetlands',
             95:'emergent_wetlands'}

df = points.replace({'landcover':nlcd_dict})

dummies = pd.get_dummies(df['landcover']).rename(columns=lambda x: str(x)) # Convert categorical feature to dummy vars
df = pd.concat([df, dummies], axis=1) # Concat with other features
df = df.drop(['landcover'], axis=1) # Drop categorical landcover feature
df = df.reset_index()

df['spi'].clip(1e-3,1e3, inplace=True)
max(df['spi'])

# Separate features and target
Y = df.pop('flooded') # Target feature, drop from features
X = df # Features

# Split data into training and testing sets (split once, use it )
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Remove rows with NaN
X_train.dropna()
X_test.dropna()
y_train.dropna()
y_test.dropna()

# Remove features not needed for training
# train_drop = pd.concat([X_train.pop(x) for x in ['imageID', 'latitude','longitude', 'spi', 'aspect', 'elevation', 'hand', 'percent_tree_cover','impervious', 'slope', 'twi', 'barren', 'crops', 'decid_forest', 'dev_highInt', 'dev_lowInt','dev_medInt', 'dev_openSpace', 'emergent_wetlands', 'evergreen_forest','grassland', 'mixed_forest', 'open_water', 'pasture', 'shrub','woody_wetlands']])
train_drop = pd.concat([X_train.pop(x) for x in ['imageID', 'latitude','longitude']])
test_drop = pd.concat([X_test.pop(x) for x in ['imageID', 'latitude','longitude']])

X_train = X_train.reset_index()
X_test = X_test.reset_index()


print('Training data and target sizes: \n{}, {}'.format(X_train.shape,y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape,y_test.shape))

Training data and target sizes: 
(6700, 26), (6700,)
Test data and target sizes: 
(3300, 26), (3300,)


Tuning hyperparameters for SVM ... skipping this for now, getting error 'Input contains NaN, infinity or a value too large for dtype('float64').' from fitting a GridSearchCV object

In [3]:
np.isnan(X_train.any()) #and gets False

In [4]:
np.isfinite(X_train.all()) #and gets True

In [5]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Grid of values for tuning hyperparameters
# C is the 
grid = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'tol': [0.0001, 0.001, 0.01, 0.1, 1.0]}]

model = LinearSVC()

gridcv = GridSearchCV(estimator=model, param_grid=grid, return_train_score=True)
gridcv.fit(X_train, y_train)
print('Support Vector Machine Best Score: ' + str(gridcv.best_score_))
print('Best C: ' + str(gridcv.best_estimator_.C))
print('Best tol: ' + str(gridcv.best_estimator_.tol))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
clf = LinearSVC(C = 0.01, 
                 tol = 0.01,
                 random_state = 32)
clf.fit(X_train, y_train)

In [None]:
# Benchmark training time
start_time1 = time.time()
time1 = str(round(time.time() - start_time1, 2))
print(time1 + " seconds")