# Classifier Training

***
## Notes

 - http://www.deeplearningbook.org/contents/guidelines.html
 - Determine goals:
 - Establish end-to-end working pipeline:
 - Instrument  the system to determine bottlenecks in performance. Diagnose which components are performing worse, find over/underfitting, defect in data, etc.
 - Repeatedly make incremental changes (gathering new data, adjusting hyperparameters, changing algorithms) based on findings from instrumentation

## Download exported tables locally for use with TensorFlow

In [None]:
# Authenticate Google Drive access
# following guide: https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html
# Need client_secrets.json file for auth to work -- I put it in 'notebooks' folder, but how can we put it somewhere more appropriate and still have the auth find it?
from pydrive.auth import GoogleAuth

gauth = GoogleAuth()
gauth.LocalWebserverAuth()

In [None]:
from pydrive.drive import GoogleDrive
drive = GoogleDrive(gauth)

# list files in root folder
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()

# Get file id of export folder we're saving tables to
exportFolderID = [file['id'] for file in file_list if file['title']=='GEE_exports'][0] 

# lists files in export folder
file_list = drive.ListFile({'q': "'" + exportFolderID + "' in parents and trashed=false"}).GetList()
for file1 in file_list:
    print('title: %s, id: %s' % (file1['title'], file1['id']))
    
# Save first file in list locally (check with print command above to ensure it's sample points)
file_list[0].GetContentFile('../Data/sample_points.csv')

## Playing with TensorFlow

In [None]:
import numpy as np
import tensorflow as tf

## Random Forest

## Neural Network

## SVM

In [11]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV

In [129]:
# CSV of sample points with features
points = pd.read_csv('../Data/sample_points.csv')

df = points.copy()
dropCols = pd.concat([df.pop(x) for x in ['imageID', 'latitude','longitude']]) # Pop out imageID, lat, lon so we can check NaN and inf. 

# Check presence of infinite or NaN values
print(' Is NaN?:', np.any(np.isnan(df)), "\n",
     'Is inf?:', not np.any(np.isfinite(df)))

# Remove NaN
df = df.dropna()

# Recode landcover categories as dummy variables
# Probably want to remove impervious as well, or collapse some of the landcover types
nlcd_dict = {11:'open_water', # Dict of landcover values
             12:'perm_snowIce',
             21:'dev_openSpace',
             22:'dev_lowInt',
             23:'dev_medInt',
             24:'dev_highInt',
             31:'barren',
             41:'decid_forest',
             42:'evergreen_forest',
             43:'mixed_forest',
             51:'dwarf_scrub',
             52:'shrub',
             71:'grassland',
             72:'sedge',
             73:'lichens',
             74:'moss',
             81:'pasture',
             82:'crops',
             90:'woody_wetlands',
             95:'emergent_wetlands'}

df = df.replace({'landcover':nlcd_dict})

dummies = pd.get_dummies(df['landcover']).rename(columns=lambda x: str(x)) # Convert categorical feature to dummy vars
df = pd.concat([df, dummies], axis=1) # Concat with other features
df = df.drop(['landcover'], axis=1) # Drop categorical landcover feature
# df = df.reset_index()

# Rescale values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# df_scaled = pd.concat([df_scaled, pd.DataFrame(imageID)]) # Add imageID back. Converting to NaN when I concat so leaving out for now

print(df_scaled)

# Separate features and target
Y = df.pop('flooded') # Target feature, drop from features
X = df # Features

# Split data into training and testing sets (split once, use it )
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Remove rows with NaN
X_train.dropna()
X_test.dropna()
y_train.dropna()
y_test.dropna()

print('Training data and target sizes: \n{}, {}'.format(X_train.shape,y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape,y_test.shape))

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Grid of values for tuning hyperparameters
# C is the 

grid = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'tol': [0.0001, 0.001, 0.01, 0.1, 1.0]}]

model = LinearSVC()

gridcv = GridSearchCV(estimator=model, param_grid=grid, return_train_score=True)
gridcv.fit(X_train, y_train)
print('Support Vector Machine Best Score: ' + str(gridcv.best_score_))
print('Best C: ' + str(gridcv.best_estimator_.C))
print('Best tol: ' + str(gridcv.best_estimator_.tol))

 Is NaN?: True 
 Is inf?: False
        aspect     curve  elevation  flooded      hand  percent_tree_cover  \
0     0.109698  0.479518   0.165285      0.0  0.000000                0.00   
1     0.860799  0.453012   0.243296      0.0  0.006135                0.00   
2     0.777279  0.479518   0.464164      0.0  0.018405                0.00   
3     0.797801  0.450602   0.315456      0.0  0.177914                0.00   
4     0.412068  0.438554   0.303023      0.0  0.007669                0.00   
5     0.421486  0.462651   0.235739      0.0  0.032209                0.00   
6     0.751797  0.453012   0.209654      0.0  0.009202                0.00   
7     0.110680  0.443373   0.349586      0.0  0.010736                0.00   
8     0.860405  0.443373   0.223549      0.0  0.087423                0.00   
9     0.546616  0.481928   0.400536      0.0  0.219325                0.19   
10    0.402875  0.469880   0.174549      0.0  0.035276                0.00   
11    0.250599  0.453012   0.314

Support Vector Machine Best Score: 0.7813953488372093
Best C: 0.01
Best tol: 0.001


Tuning hyperparameters for SVM ... skipping this for now, getting error 'Input contains NaN, infinity or a value too large for dtype('float64').' from fitting a GridSearchCV object

In [130]:
clf = LinearSVC(C = gridcv.best_estimator_.C, 
                 tol=gridcv.best_estimator_.tol,
                 random_state=0)
clf.fit(X_train, y_train)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.001, verbose=0)

In [None]:
# Performance metrics
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, y_pred)))

In [None]:
# # Benchmark training time
# start_time1 = time.time()
# time1 = str(round(time.time() - start_time1, 2))
# print(time1 + " seconds")